# -*-encoding:utf-8-*-
import datetime
import re
import requests
import building
from bs4 import BeautifulSoup
import threading
import csv
import time
import random

# Base URL of the listing site; GetBuildings prefixes paths with it and
# substitutes it for '..' in project links.
host='http://newhouse.njhouse.com.cn'
# Single lock shared by the worker threads; it guards req_num and datas.
lock = threading.Lock()
# Rows gathered by the worker threads (extended in run_thread, written by SaveDatas).
datas=[]
# Thread objects created and joined by Run().
threads=[]
req_num=0# number of requests currently in flight

def log(data):
	"""Append ``data`` followed by a CRLF terminator to ``living2.txt``."""
	log_path='living2.txt'
	with open(log_path, 'a') as log_file:
		record=data+'\r\n'
		log_file.write(record)

def GetPageSoup(url):
	'''
	Download ``url`` and return the parsed page.

	At most 10 requests are kept in flight at once to stay clear of the
	site's anti-crawler limits; a caller that finds no free slot sleeps for
	1-3 seconds and tries again.

	:param url: absolute URL of the page to fetch.
	:return: the object built by ``building.BeautifulSoup(html, 'html.parser')``.
	:raises: whatever ``requests.get`` / reading the body raises; the
		in-flight counter is released in that case as well.
	'''
	global req_num
	# Reserve a slot. The test and the increment share one lock hold, so two
	# threads can never both see a free slot and overshoot the limit.
	while True:
		with lock:
			if req_num<10:
				req_num+=1
				break
		time.sleep(random.uniform(1,3))

	headers={"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 GTB7.1 (.NET CLR 3.5.30729)", "Referer": "http://www.njhouse.com.cn"}
	try:
		r=requests.get(url, stream=True,headers=headers)
		# stream=True postpones the body download until .content is read, so
		# read it while the slot is still held.
		raw=r.content
	finally:
		# Always give the slot back; otherwise one failed request would leave
		# the counter stuck and starve every waiting thread.
		with lock:
			req_num-=1

	# r.encoding is None when the response headers carry no charset hint.
	encoding=r.encoding or r.apparent_encoding or 'utf-8'
	html=raw.decode(encoding,'ignore').encode('utf-8')
	return building.BeautifulSoup(html, 'html.parser')

class BuildingInfo(threading.local):
	"""
	Scrapes one project's detail page together with the pages it embeds.

	``GetOverview`` is the entry point; the project-level fields it gathers
	are kept in ``self.dic``.
	"""

	# (result key, row index, cell index) inside the second table of the
	# sales-detail page.
	_SALE_CELLS=(
		('count_salable',0,1),# units available for sale
		('count_subscription',1,3),# units subscribed
		('count_traded',2,1),# units sold
		('count_subscription_today',5,1),# units subscribed today
		('count_traded_today',5,3),# units sold today
		('price_average',3,1),# average price
		('price_average_home',3,3),# average price, residential
		('price_average_home_month',6,1),# average price this month, residential
		('price_average_office',4,1),# average price, office
		('price_average_office_month',6,3),# average price this month, office
		('price_average_business',4,3),# average price, commercial
		('price_average_business_month',7,1),# average price this month, commercial
		('rate_turnover',7,3),# turnover rate
	)

	# (result key, tag name, label text); the value is read from the element
	# that follows the label.
	_PRESALE_LABELS=(
		('presale_dev','span',u'开发企业'),# developer
		('presale_name','span',u'项目名称'),# project name
		('presale_addr','span',u'项目地址'),# project address
		('presale_date','span',u'申报日期'),# filing date
		('presale_land_life','td',u'土地使用年限'),# land-use term
		('presale_plot_ratio','td',u'容积率'),# plot ratio
		('presale_green_rate','td',u'绿地率'),# green-space ratio
		('presale_building_density','td',u'建筑密度'),# building density
		('presale_num_total','td',u'房屋总幢数'),# total number of buildings
		('presale_num_total_house','td',u'其中住宅总幢数'),# of which residential
	)

	# (result key, cell index) for each row of the presale buildings table.
	# Cell 3 is not exported. The misspelt "duilding" keys are kept because
	# SaveDatas looks them up under exactly these names.
	_PRESALE_ROW_CELLS=(
		('presale_building_period',0),# phase
		('presale_building_no',1),# building number
		('presale_building_usage',2),# usage
		('presale_building_amount',4),# number of units
		('presale_building_decoration',5),# decoration type
		('presale_duilding_delivery_time',6),# planned delivery date
		('presale_duilding_avg_price',7),# planned average price
	)

	def __init__(self, url,last_open_time):
		"""
		:param url: URL of the project's detail page.
		:param last_open_time: opening date as shown on the listing page.
		"""
		self.url = url
		self.dic={}
		self.dic['last_open_time']=last_open_time# opening date
		self.dic['url']=url

	@staticmethod
	def _merged(base,extra):
		"""Return a new dict holding ``base`` overridden by ``extra``."""
		result=dict(base)
		result.update(extra)
		return result

	@staticmethod
	def _text_after(node):
		"""Return ``.string`` of the element after ``node``, or '' if either is missing."""
		if node is None or node.next_sibling is None:
			return ''
		return node.next_sibling.string

	def GetOverview(self):
		"""
		Scrape the detail page, its sales figures and its presale schemes.

		:return: a list of dicts. Each presale row is merged over the
			project-level fields; when no presale rows were found the list
			holds only the project-level dict.
		"""
		_soup=GetPageSoup(self.dic['url'])
		self.dic['name_promotion']=_soup.find('font',size='4').get_text()# promotion name
		self.dic['name']=_soup.find('font',size='2').get_text().strip('(').strip(')')# project name
		self.dic['phone']=_soup.find('font',size='3').get_text()# contact phone

		# Table 19 carries address, usage and district, one per row.
		_rows=_soup.find_all('table')[19].find_all('tr')
		self.dic['address']=_rows[0].find_all('td')[1].string# address
		self.dic['project_type']=_rows[1].find_all('td')[1].string.strip()# usage
		self.dic['owned']=_rows[2].find_all('td')[1].string# district

		_iframes=_soup.find_all('iframe')
		# Sales detail figures.
		self.dic=self._merged(self.dic,self.GetSaleDetail(_iframes[3]['src']))

		# Presale schemes, one page per licence link.
		_list_presales=[]
		for _url in self.GetSaleLicenseUrl(_iframes[2]['src']):
			_l=self.GetPresaleScheme(_url)
			if _l is not None:
				_list_presales+=_l

		if not _list_presales:
			return [self.dic]
		return [self._merged(self.dic,_dic) for _dic in _list_presales]

	def GetSaleDetail(self,iframe_url):
		"""
		Read the sales counters and average prices of the project.

		:param iframe_url: ``src`` of the sales iframe on the detail page.
		:return: dict with ``count_all`` plus one entry per ``_SALE_CELLS``
			key; every value is the stripped text of a table cell.
		"""
		# The iframe page only wraps a second iframe; the inner one is the
		# real data page.
		_outer=GetPageSoup(iframe_url)
		src=_outer.iframe['src']
		src=src.replace('../..',iframe_url.split('/include/fdc_include',2)[0]).split('&sid=')[0]
		_soup=GetPageSoup(src)

		_dic={}
		_dic['count_all']=_soup.body.table.tr.find_all('td')[1].get_text().strip()# units listed for online sale
		# Collect the rows once instead of re-scanning the table per field.
		_rows=_soup.body.find_all('table')[1].find_all('tr')
		for _key,_row,_col in self._SALE_CELLS:
			_dic[_key]=_rows[_row].find_all('td')[_col].get_text().strip()
		return _dic

	def GetPresaleScheme(self,url):
		"""
		Parse one presale-scheme page.

		:param url: URL of the presale-scheme page.
		:return: ``None`` when the page reports that the licence has no
			presale data; otherwise a list with one dict per building row
			(scheme-level fields merged with the row's own fields). The list
			is empty when the buildings table is absent or has no usable rows.
		"""
		_soup=GetPageSoup(url)
		if _soup.string==u'当前许可证没有预售方案数据。':
			return None

		_dic={'presale_url':url}
		_sp_num=_soup.select_one('.bodyMsg')
		_dic['presale_num']=_sp_num.string if _sp_num is not None else ''# scheme number
		for _key,_tag,_label in self._PRESALE_LABELS:
			_dic[_key]=self._text_after(_soup.find(_tag,string=_label))

		presale=[]
		# The second ".Demo" table lists the buildings covered by this application.
		_tables=_soup.select('.Demo')
		if len(_tables)<2:
			return presale
		table=_tables[1]
		if table.find('th',string=u'期数') is None:
			return presale

		for tr in table.tbody.find_all('tr'):
			tds=tr.find_all('td')
			if len(tds)<7:
				continue
			_dic_base={}
			for _key,_idx in self._PRESALE_ROW_CELLS:
				# A row may stop before the price cell; export '' rather than fail.
				_dic_base[_key]=tds[_idx].string if _idx<len(tds) else ''
			presale.append(self._merged(_dic,_dic_base))
		return presale

	def GetSaleLicenseUrl(self,iframe_url):
		"""
		Collect the presale-scheme links from the licence iframe.

		:param iframe_url: ``src`` of the licence iframe on the detail page.
		:return: list of ``href`` values; empty when the ``.scroll``
			container is not present.
		"""
		_soup=GetPageSoup(iframe_url)
		_div=_soup.select_one('.scroll')
		if _div is None:
			return []
		return [a['href'] for a in _div.find_all('a',string=u'预售方案')]

	def __str__(self):
		return 'self'

def GetBuildings(min_year=2017):
	"""
	Read the project listing at ``host + '/kpgg/'``.

	:param min_year: scanning stops at the first row whose opening year is
		earlier than this (default 2017).
	:return: list of dicts with the keys ``last_open_time``, ``name``,
		``name_promote``, ``project_type``, ``address``, ``phone`` and ``url``.
	:raises ValueError: if an opening date does not match ``%Y.%m.%d``.
	"""
	soup=GetPageSoup(host+'/kpgg/')
	tables= soup.body.find_all('table')
	# tables[4] and tables[5] are the two residential listing tables.
	rows=tables[4].find_all('tr',valign="top")+tables[5].find_all('tr',valign="top")

	buildings=[]
	for tr in rows:
		tds=tr.find_all('td')
		b={}
		b['last_open_time']=tds[0].string# opening date
		b['name']=tds[1].get_text().strip()# project name
		b['name_promote']=tds[2].get_text().strip()# promotion name
		b['project_type']=tds[3].get_text().strip()# project category
		b['address']=tds[4].get_text().strip()# project address
		b['phone']=tds[5].get_text().strip()# contact phone
		b['url']=tds[1].find('a').get('href').replace('..',host)# project link
		if datetime.datetime.strptime(b['last_open_time'], "%Y.%m.%d").date().year<min_year:
			# NOTE(review): this ends the scan of BOTH tables, so an old row
			# in the first table also hides every row of the second one.
			# Presumably the rows are sorted newest-first - confirm.
			break
		buildings.append(b)
	return buildings

def run_thread(last_open_time,url):
	"""
	Thread body: scrape one project and add its rows to the shared ``datas``.

	:param last_open_time: opening date passed on to ``BuildingInfo``.
	:param url: detail-page URL passed on to ``BuildingInfo``.
	"""
	worker=threading.current_thread().name
	print('thread %s >> begin'%(worker))

	rows=BuildingInfo(url,last_open_time).GetOverview()

	# datas is shared by every worker, so touch it only while holding the lock.
	with lock:
		datas.extend(rows)
	print('thread %s >> end'%(worker))

def SaveDatas():
	"""
	Write every row in ``datas`` to ``some.csv``.

	The file starts with a UTF-8 byte-order mark, then the header row, then
	one line per collected dict; every cell is UTF-8 encoded. Project and
	sales fields are mandatory, presale fields default to an empty cell.

	:raises KeyError: if a row lacks one of the mandatory fields.
	"""
	titles=[u'开盘日期',u'项目名',u'推广名',u'区属 ',u'用途',u'项目地址',u'联系电话',u'项目链接']
	titles+=[u'纳入网上销售总套数',u'可售总套数',u'认购套数',u'成交套数',u'今日认购套数',u'今日成交套数',u'均价',u'住宅类均价',u'住宅类当月均价',u'办公类均价',u'办公类当月均价',u'商业类均价',u'商业类当月均价',u'楼盘换手率']
	titles+=[u'预售链接',u'预售编号',u'开发企业',u'项目名称',u'项目地址',u'申报日期',u'土地使用年限',u'容积率',u'绿地率',u'建筑密度',u'房屋总幢数',u'其中住宅总幢数']
	titles+=[u'期数',u'幢号',u'用途',u'套数(间、个)',u'装修类型',u'拟交付时间',u'拟销售均价(元/㎡,车位：万元/个)']

	# Column order matches ``titles``. Basic info and sales figures must be present.
	required=(
		'last_open_time','name','name_promotion','owned','project_type','address','phone','url',
		'count_all','count_salable','count_subscription','count_traded',
		'count_subscription_today','count_traded_today',
		'price_average','price_average_home','price_average_home_month',
		'price_average_office','price_average_office_month',
		'price_average_business','price_average_business_month','rate_turnover',
	)
	# Presale fields exist only for rows that came from a presale scheme.
	optional=(
		'presale_url','presale_num','presale_dev','presale_name','presale_addr','presale_date',
		'presale_land_life','presale_plot_ratio','presale_green_rate','presale_building_density',
		'presale_num_total','presale_num_total_house',
		'presale_building_period','presale_building_no','presale_building_usage',
		'presale_building_amount','presale_building_decoration',
		'presale_duilding_delivery_time','presale_duilding_avg_price',
	)

	# Binary mode keeps writerow from producing doubled line endings. The file
	# is opened once for the whole export instead of once per row.
	with open('some.csv', 'wb') as f:
		f.write(u'\ufeff'.encode('utf8'))
		writer = csv.writer(f)
		writer.writerow([title.encode('utf8') for title in titles])
		for dic in datas:
			_list=[dic[key] for key in required]
			_list+=[dic.get(key,'') for key in optional]
			writer.writerow(['' if content is None else content.encode('utf8') for content in _list])

def Run():
	"""
	Scrape every listed project on its own thread, then export the result.

	One thread per entry of ``GetBuildings()`` is started and recorded in
	``threads``; after all of them have been joined the row count is printed
	and ``SaveDatas`` is called.
	"""
	for b in GetBuildings():
		worker=threading.Thread(
			target=run_thread,
			name='t_'+b['name'].encode('gbk','ignore'),
			args=(b['last_open_time'],b['url']))
		threads.append(worker)
		worker.start()

	for worker in threads:
		worker.join()

	print('datas.length = '+str(len(datas)))
	SaveDatas()

# Entry point: the crawl starts as soon as this line executes, which also
# happens when the file is imported because there is no __main__ guard.
Run()
'''


r=requests.get('http://www.njhouse.com.cn/spf/permit/permit.php?id=119854', stream=True)
html=r.content.decode(r.encoding).encode('utf-8')
_soup=building.BeautifulSoup(html, 'html.parser')
print _soup.string

#_soup=GetPageSoup('http://www.njhouse.com.cn/spf/permit/permit.php?id=119761')'''
'''
_sp_num= _soup.select_one('.bodyMsg')
 

print _soup.find_all('td')[59].parent.parent
log(_soup.find_all('td')[59].parent.parent.prettify())
	

_land_life=_soup.find('td',string='土地使用年限')
if _land_life is not None:
	print _land_life.next_sibling.string

_plot_ratio=_soup.find('td',string='容积率')
if _plot_ratio is not None:
	print _plot_ratio.next_sibling.string
_green_rate=_soup.find('td',string='绿地率')
if _green_rate is not None:
	print _green_rate.next_sibling.string
_building_density=_soup.find('td',string='建筑密度')
if _building_density is not None:
	print _building_density.next_sibling.string
_num_total=_soup.find('td',string='房屋总幢数')
if _num_total is not None:
	print _num_total.next_sibling.string
_num_total_house=_soup.find('td',string='其中住宅总幢数')
if _num_total_house is not None:
	print _num_total_house.next_sibling.string
#print _soup.select_one('#page3').prettify()#.find('table').next_sibling.prettify()


'''
'''
soup =GetPageSoup('http://newhouse.njhouse.com.cn/detail.php?id=1792')
ts=soup.find_all('table')[19]
print ts.prettify('gbk')
print ts.find('tr').find_all('td')[1].string
print ts.find_all('tr')[1].find_all('td')[1].string.strip()
print ts.find_all('tr')[2].find_all('td')[1].string

print soup.find('font',size='4').get_text()#推广名
print soup.find('font',size='2').get_text().strip('(').strip(')')#项目名
print soup.find('font',size='3').get_text()#项目名

'''
